O objetivo do desafio House prices é prever o valor de venda de um imóvel de acordo com as 79 variáveis do banco de dados. O desafio desponibiliza 4 arquivos, um dataset de treino e um dataset de teste ambos em csv, um arquivo txt com a descrição das colunas do dataset treino e um arquivo csv exemplo de como o resultado deve ser submetido. Para acessar mais informações sobre o desafio e fazer download dos dados basta acessar o link.
# Carregando o tidyverse
library(tidyverse)
── Attaching core tidyverse packages ──────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.0 ✔ readr 2.1.4
✔ forcats 1.0.0 ✔ stringr 1.5.0
✔ ggplot2 3.4.1 ✔ tibble 3.1.8
✔ lubridate 1.9.2 ✔ tidyr 1.3.0
✔ purrr 1.0.1 ── Conflicts ────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
# Setando o diretório de trabalho
setwd(dir = "C:/Users/rodolfo.paula/Desktop/PESSOAL_RODOLFO/Better Decisions/scripts/kaggle/house_prices")
# Lendo os arquivos de treino e de teste
train = data.frame(read.csv("train.csv"))
test = data.frame(read.csv("test.csv"))
# Visualizando as dimensões dos arquivos
dim(train)
[1] 1460 81
dim(test)
[1] 1459 80
# Visualizando o dataset de treino
glimpse(train)
Rows: 1,460
Columns: 81
$ Id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1…
$ MSSubClass <int> 60, 20, 60, 70, 60, 50, 20, 60, 50, 190, 20, 60,…
$ MSZoning <chr> "RL", "RL", "RL", "RL", "RL", "RL", "RL", "RL", …
$ LotFrontage <int> 65, 80, 68, 60, 84, 85, 75, NA, 51, 50, 70, 85, …
$ LotArea <int> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 10…
$ Street <chr> "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", …
$ Alley <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ LotShape <chr> "Reg", "Reg", "IR1", "IR1", "IR1", "IR1", "Reg",…
$ LandContour <chr> "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl",…
$ Utilities <chr> "AllPub", "AllPub", "AllPub", "AllPub", "AllPub"…
$ LotConfig <chr> "Inside", "FR2", "Inside", "Corner", "FR2", "Ins…
$ LandSlope <chr> "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl",…
$ Neighborhood <chr> "CollgCr", "Veenker", "CollgCr", "Crawfor", "NoR…
$ Condition1 <chr> "Norm", "Feedr", "Norm", "Norm", "Norm", "Norm",…
$ Condition2 <chr> "Norm", "Norm", "Norm", "Norm", "Norm", "Norm", …
$ BldgType <chr> "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", …
$ HouseStyle <chr> "2Story", "1Story", "2Story", "2Story", "2Story"…
$ OverallQual <int> 7, 6, 7, 7, 8, 5, 8, 7, 7, 5, 5, 9, 5, 7, 6, 7, …
$ OverallCond <int> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 5, 5, 8, …
$ YearBuilt <int> 2003, 1976, 2001, 1915, 2000, 1993, 2004, 1973, …
$ YearRemodAdd <int> 2003, 1976, 2002, 1970, 2000, 1995, 2005, 1973, …
$ RoofStyle <chr> "Gable", "Gable", "Gable", "Gable", "Gable", "Ga…
$ RoofMatl <chr> "CompShg", "CompShg", "CompShg", "CompShg", "Com…
$ Exterior1st <chr> "VinylSd", "MetalSd", "VinylSd", "Wd Sdng", "Vin…
$ Exterior2nd <chr> "VinylSd", "MetalSd", "VinylSd", "Wd Shng", "Vin…
$ MasVnrType <chr> "BrkFace", "None", "BrkFace", "None", "BrkFace",…
$ MasVnrArea <int> 196, 0, 162, 0, 350, 0, 186, 240, 0, 0, 0, 286, …
$ ExterQual <chr> "Gd", "TA", "Gd", "TA", "Gd", "TA", "Gd", "TA", …
$ ExterCond <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", …
$ Foundation <chr> "PConc", "CBlock", "PConc", "BrkTil", "PConc", "…
$ BsmtQual <chr> "Gd", "Gd", "Gd", "TA", "Gd", "Gd", "Ex", "Gd", …
$ BsmtCond <chr> "TA", "TA", "TA", "Gd", "TA", "TA", "TA", "TA", …
$ BsmtExposure <chr> "No", "Gd", "Mn", "No", "Av", "No", "Av", "Mn", …
$ BsmtFinType1 <chr> "GLQ", "ALQ", "GLQ", "ALQ", "GLQ", "GLQ", "GLQ",…
$ BsmtFinSF1 <int> 706, 978, 486, 216, 655, 732, 1369, 859, 0, 851,…
$ BsmtFinType2 <chr> "Unf", "Unf", "Unf", "Unf", "Unf", "Unf", "Unf",…
$ BsmtFinSF2 <int> 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0,…
$ BsmtUnfSF <int> 150, 284, 434, 540, 490, 64, 317, 216, 952, 140,…
$ TotalBsmtSF <int> 856, 1262, 920, 756, 1145, 796, 1686, 1107, 952,…
$ Heating <chr> "GasA", "GasA", "GasA", "GasA", "GasA", "GasA", …
$ HeatingQC <chr> "Ex", "Ex", "Ex", "Gd", "Ex", "Ex", "Ex", "Ex", …
$ CentralAir <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y"…
$ Electrical <chr> "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SB…
$ X1stFlrSF <int> 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1022…
$ X2ndFlrSF <int> 854, 0, 866, 756, 1053, 566, 0, 983, 752, 0, 0, …
$ LowQualFinSF <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ GrLivArea <int> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 2090, …
$ BsmtFullBath <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, …
$ BsmtHalfBath <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ FullBath <int> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, 1, …
$ HalfBath <int> 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, …
$ BedroomAbvGr <int> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2, 3, 4, 2, 3, 2, 2, …
$ KitchenAbvGr <int> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, …
$ KitchenQual <chr> "Gd", "TA", "Gd", "Gd", "Gd", "TA", "Gd", "TA", …
$ TotRmsAbvGrd <int> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5, 5, 11, 4, 7, 5, 5,…
$ Functional <chr> "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ",…
$ Fireplaces <int> 0, 1, 1, 1, 1, 0, 1, 2, 2, 2, 0, 2, 0, 1, 1, 0, …
$ FireplaceQu <chr> NA, "TA", "TA", "Gd", "TA", NA, "Gd", "TA", "TA"…
$ GarageType <chr> "Attchd", "Attchd", "Attchd", "Detchd", "Attchd"…
$ GarageYrBlt <int> 2003, 1976, 2001, 1998, 2000, 1993, 2004, 1973, …
$ GarageFinish <chr> "RFn", "RFn", "RFn", "Unf", "RFn", "Unf", "RFn",…
$ GarageCars <int> 2, 2, 2, 3, 3, 2, 2, 2, 2, 1, 1, 3, 1, 3, 1, 2, …
$ GarageArea <int> 548, 460, 608, 642, 836, 480, 636, 484, 468, 205…
$ GarageQual <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", …
$ GarageCond <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", …
$ PavedDrive <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y"…
$ WoodDeckSF <int> 0, 298, 0, 0, 192, 40, 255, 235, 90, 0, 0, 147, …
$ OpenPorchSF <int> 61, 0, 42, 35, 84, 30, 57, 204, 0, 4, 0, 21, 0, …
$ EnclosedPorch <int> 0, 0, 0, 272, 0, 0, 0, 228, 205, 0, 0, 0, 0, 0, …
$ X3SsnPorch <int> 0, 0, 0, 0, 0, 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ ScreenPorch <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 176, 0, 0, 0…
$ PoolArea <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ PoolQC <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ Fence <chr> NA, NA, NA, NA, NA, "MnPrv", NA, NA, NA, NA, NA,…
$ MiscFeature <chr> NA, NA, NA, NA, NA, "Shed", NA, "Shed", NA, NA, …
$ MiscVal <int> 0, 0, 0, 0, 0, 700, 0, 350, 0, 0, 0, 0, 0, 0, 0,…
$ MoSold <int> 2, 5, 9, 2, 12, 10, 8, 11, 4, 1, 2, 7, 9, 8, 5, …
$ YrSold <int> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 2009, …
$ SaleType <chr> "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", …
$ SaleCondition <chr> "Normal", "Normal", "Normal", "Abnorml", "Normal…
$ SalePrice <int> 208500, 181500, 223500, 140000, 250000, 143000, …
# Obtendo um sumário dos dados de treino
summary(train)
Id MSSubClass MSZoning LotFrontage
Min. : 1.0 Min. : 20.0 Length:1460 Min. : 21.00
1st Qu.: 365.8 1st Qu.: 20.0 Class :character 1st Qu.: 59.00
Median : 730.5 Median : 50.0 Mode :character Median : 69.00
Mean : 730.5 Mean : 56.9 Mean : 70.05
3rd Qu.:1095.2 3rd Qu.: 70.0 3rd Qu.: 80.00
Max. :1460.0 Max. :190.0 Max. :313.00
NA's :259
LotArea Street Alley
Min. : 1300 Length:1460 Length:1460
1st Qu.: 7554 Class :character Class :character
Median : 9478 Mode :character Mode :character
Mean : 10517
3rd Qu.: 11602
Max. :215245
LotShape LandContour Utilities
Length:1460 Length:1460 Length:1460
Class :character Class :character Class :character
Mode :character Mode :character Mode :character
LotConfig LandSlope Neighborhood
Length:1460 Length:1460 Length:1460
Class :character Class :character Class :character
Mode :character Mode :character Mode :character
Condition1 Condition2 BldgType
Length:1460 Length:1460 Length:1460
Class :character Class :character Class :character
Mode :character Mode :character Mode :character
HouseStyle OverallQual OverallCond YearBuilt
Length:1460 Min. : 1.000 Min. :1.000 Min. :1872
Class :character 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954
Mode :character Median : 6.000 Median :5.000 Median :1973
Mean : 6.099 Mean :5.575 Mean :1971
3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000
Max. :10.000 Max. :9.000 Max. :2010
YearRemodAdd RoofStyle RoofMatl
Min. :1950 Length:1460 Length:1460
1st Qu.:1967 Class :character Class :character
Median :1994 Mode :character Mode :character
Mean :1985
3rd Qu.:2004
Max. :2010
Exterior1st Exterior2nd MasVnrType
Length:1460 Length:1460 Length:1460
Class :character Class :character Class :character
Mode :character Mode :character Mode :character
MasVnrArea ExterQual ExterCond
Min. : 0.0 Length:1460 Length:1460
1st Qu.: 0.0 Class :character Class :character
Median : 0.0 Mode :character Mode :character
Mean : 103.7
3rd Qu.: 166.0
Max. :1600.0
NA's :8
Foundation BsmtQual BsmtCond
Length:1460 Length:1460 Length:1460
Class :character Class :character Class :character
Mode :character Mode :character Mode :character
BsmtExposure BsmtFinType1 BsmtFinSF1
Length:1460 Length:1460 Min. : 0.0
Class :character Class :character 1st Qu.: 0.0
Mode :character Mode :character Median : 383.5
Mean : 443.6
3rd Qu.: 712.2
Max. :5644.0
BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
Length:1460 Min. : 0.00 Min. : 0.0 Min. : 0.0
Class :character 1st Qu.: 0.00 1st Qu.: 223.0 1st Qu.: 795.8
Mode :character Median : 0.00 Median : 477.5 Median : 991.5
Mean : 46.55 Mean : 567.2 Mean :1057.4
3rd Qu.: 0.00 3rd Qu.: 808.0 3rd Qu.:1298.2
Max. :1474.00 Max. :2336.0 Max. :6110.0
Heating HeatingQC CentralAir
Length:1460 Length:1460 Length:1460
Class :character Class :character Class :character
Mode :character Mode :character Mode :character
Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
Length:1460 Min. : 334 Min. : 0 Min. : 0.000
Class :character 1st Qu.: 882 1st Qu.: 0 1st Qu.: 0.000
Mode :character Median :1087 Median : 0 Median : 0.000
Mean :1163 Mean : 347 Mean : 5.845
3rd Qu.:1391 3rd Qu.: 728 3rd Qu.: 0.000
Max. :4692 Max. :2065 Max. :572.000
GrLivArea BsmtFullBath BsmtHalfBath FullBath
Min. : 334 Min. :0.0000 Min. :0.00000 Min. :0.000
1st Qu.:1130 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:1.000
Median :1464 Median :0.0000 Median :0.00000 Median :2.000
Mean :1515 Mean :0.4253 Mean :0.05753 Mean :1.565
3rd Qu.:1777 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:2.000
Max. :5642 Max. :3.0000 Max. :2.00000 Max. :3.000
HalfBath BedroomAbvGr KitchenAbvGr KitchenQual
Min. :0.0000 Min. :0.000 Min. :0.000 Length:1460
1st Qu.:0.0000 1st Qu.:2.000 1st Qu.:1.000 Class :character
Median :0.0000 Median :3.000 Median :1.000 Mode :character
Mean :0.3829 Mean :2.866 Mean :1.047
3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:1.000
Max. :2.0000 Max. :8.000 Max. :3.000
TotRmsAbvGrd Functional Fireplaces FireplaceQu
Min. : 2.000 Length:1460 Min. :0.000 Length:1460
1st Qu.: 5.000 Class :character 1st Qu.:0.000 Class :character
Median : 6.000 Mode :character Median :1.000 Mode :character
Mean : 6.518 Mean :0.613
3rd Qu.: 7.000 3rd Qu.:1.000
Max. :14.000 Max. :3.000
GarageType GarageYrBlt GarageFinish GarageCars
Length:1460 Min. :1900 Length:1460 Min. :0.000
Class :character 1st Qu.:1961 Class :character 1st Qu.:1.000
Mode :character Median :1980 Mode :character Median :2.000
Mean :1979 Mean :1.767
3rd Qu.:2002 3rd Qu.:2.000
Max. :2010 Max. :4.000
NA's :81
GarageArea GarageQual GarageCond
Min. : 0.0 Length:1460 Length:1460
1st Qu.: 334.5 Class :character Class :character
Median : 480.0 Mode :character Mode :character
Mean : 473.0
3rd Qu.: 576.0
Max. :1418.0
PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch
Length:1460 Min. : 0.00 Min. : 0.00 Min. : 0.00
Class :character 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00
Mode :character Median : 0.00 Median : 25.00 Median : 0.00
Mean : 94.24 Mean : 46.66 Mean : 21.95
3rd Qu.:168.00 3rd Qu.: 68.00 3rd Qu.: 0.00
Max. :857.00 Max. :547.00 Max. :552.00
X3SsnPorch ScreenPorch PoolArea PoolQC
Min. : 0.00 Min. : 0.00 Min. : 0.000 Length:1460
1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000 Class :character
Median : 0.00 Median : 0.00 Median : 0.000 Mode :character
Mean : 3.41 Mean : 15.06 Mean : 2.759
3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.000
Max. :508.00 Max. :480.00 Max. :738.000
Fence MiscFeature MiscVal
Length:1460 Length:1460 Min. : 0.00
Class :character Class :character 1st Qu.: 0.00
Mode :character Mode :character Median : 0.00
Mean : 43.49
3rd Qu.: 0.00
Max. :15500.00
MoSold YrSold SaleType SaleCondition
Min. : 1.000 Min. :2006 Length:1460 Length:1460
1st Qu.: 5.000 1st Qu.:2007 Class :character Class :character
Median : 6.000 Median :2008 Mode :character Mode :character
Mean : 6.322 Mean :2008
3rd Qu.: 8.000 3rd Qu.:2009
Max. :12.000 Max. :2010
SalePrice
Min. : 34900
1st Qu.:129975
Median :163000
Mean :180921
3rd Qu.:214000
Max. :755000
Utilizando o sumário fica claro que o banco de dados de treino possui muitas variáveis. Vamos incialmente separar o banco de dados em variáveis qualitativas e variáveis quantitativas, assim se torna mais facil o tratamento e limpeza dos dados.
Para as variáveis qualitativas que apresentem valores faltantes ou NA`s vamos substituir pela moda daquela variável e para as variáveis quantitativas de que apresentem valores faltantes vamos substituir pela média.
A coluna Id será eliminada por ser um valor sequencial que nao representa nehuma informação útil como variável explicativa capaz de influenciar no valor predito do preço de venda de uma casa.
Para separar o dataset em dados quantitativos e qualitativos o melhor caminho é usara a função select_if() do pacote dplyr
# Separando o dataframe em train quanti e train quali usando a função select_if
train_quanti <- select_if(train, is.numeric)
train_quali <- select_if(train, is.character)
#Visualizando os dados
glimpse(train_quali)
Rows: 1,460
Columns: 43
$ MSZoning <chr> "RL", "RL", "RL", "RL", "RL", "RL", "RL", "RL", …
$ Street <chr> "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", …
$ Alley <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ LotShape <chr> "Reg", "Reg", "IR1", "IR1", "IR1", "IR1", "Reg",…
$ LandContour <chr> "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl",…
$ Utilities <chr> "AllPub", "AllPub", "AllPub", "AllPub", "AllPub"…
$ LotConfig <chr> "Inside", "FR2", "Inside", "Corner", "FR2", "Ins…
$ LandSlope <chr> "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl",…
$ Neighborhood <chr> "CollgCr", "Veenker", "CollgCr", "Crawfor", "NoR…
$ Condition1 <chr> "Norm", "Feedr", "Norm", "Norm", "Norm", "Norm",…
$ Condition2 <chr> "Norm", "Norm", "Norm", "Norm", "Norm", "Norm", …
$ BldgType <chr> "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", …
$ HouseStyle <chr> "2Story", "1Story", "2Story", "2Story", "2Story"…
$ RoofStyle <chr> "Gable", "Gable", "Gable", "Gable", "Gable", "Ga…
$ RoofMatl <chr> "CompShg", "CompShg", "CompShg", "CompShg", "Com…
$ Exterior1st <chr> "VinylSd", "MetalSd", "VinylSd", "Wd Sdng", "Vin…
$ Exterior2nd <chr> "VinylSd", "MetalSd", "VinylSd", "Wd Shng", "Vin…
$ MasVnrType <chr> "BrkFace", "None", "BrkFace", "None", "BrkFace",…
$ ExterQual <chr> "Gd", "TA", "Gd", "TA", "Gd", "TA", "Gd", "TA", …
$ ExterCond <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", …
$ Foundation <chr> "PConc", "CBlock", "PConc", "BrkTil", "PConc", "…
$ BsmtQual <chr> "Gd", "Gd", "Gd", "TA", "Gd", "Gd", "Ex", "Gd", …
$ BsmtCond <chr> "TA", "TA", "TA", "Gd", "TA", "TA", "TA", "TA", …
$ BsmtExposure <chr> "No", "Gd", "Mn", "No", "Av", "No", "Av", "Mn", …
$ BsmtFinType1 <chr> "GLQ", "ALQ", "GLQ", "ALQ", "GLQ", "GLQ", "GLQ",…
$ BsmtFinType2 <chr> "Unf", "Unf", "Unf", "Unf", "Unf", "Unf", "Unf",…
$ Heating <chr> "GasA", "GasA", "GasA", "GasA", "GasA", "GasA", …
$ HeatingQC <chr> "Ex", "Ex", "Ex", "Gd", "Ex", "Ex", "Ex", "Ex", …
$ CentralAir <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y"…
$ Electrical <chr> "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SB…
$ KitchenQual <chr> "Gd", "TA", "Gd", "Gd", "Gd", "TA", "Gd", "TA", …
$ Functional <chr> "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ",…
$ FireplaceQu <chr> NA, "TA", "TA", "Gd", "TA", NA, "Gd", "TA", "TA"…
$ GarageType <chr> "Attchd", "Attchd", "Attchd", "Detchd", "Attchd"…
$ GarageFinish <chr> "RFn", "RFn", "RFn", "Unf", "RFn", "Unf", "RFn",…
$ GarageQual <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", …
$ GarageCond <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", …
$ PavedDrive <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y"…
$ PoolQC <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ Fence <chr> NA, NA, NA, NA, NA, "MnPrv", NA, NA, NA, NA, NA,…
$ MiscFeature <chr> NA, NA, NA, NA, NA, "Shed", NA, "Shed", NA, NA, …
$ SaleType <chr> "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", …
$ SaleCondition <chr> "Normal", "Normal", "Normal", "Abnorml", "Normal…
# Procurando Valores missing
colSums(is.na(train_quali))
MSZoning Street Alley LotShape LandContour
0 0 1369 0 0
Utilities LotConfig LandSlope Neighborhood Condition1
0 0 0 0 0
Condition2 BldgType HouseStyle RoofStyle RoofMatl
0 0 0 0 0
Exterior1st Exterior2nd MasVnrType ExterQual ExterCond
0 0 8 0 0
Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1
0 37 37 38 37
BsmtFinType2 Heating HeatingQC CentralAir Electrical
38 0 0 0 1
KitchenQual Functional FireplaceQu GarageType GarageFinish
0 0 690 81 81
GarageQual GarageCond PavedDrive PoolQC Fence
81 81 0 1453 1179
MiscFeature SaleType SaleCondition
1406 0 0
Com essa analise a cima sabemos o seguinte sobre os nossos dados categóricos:
# Deletando as colunas que não são de interesse para o dataset.
train_quali <-train_quali %>% select(-c(Alley, PoolQC, Fence, MiscFeature))
print(dim(train_quali))
[1] 1460 39
# Lista com as variáveis : MasVnrType,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Electrical,
# FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond
##############################################################
# Contando os valores NA`s das variáveis com missing values
unique(train_quali$MasVnrType)
[1] "BrkFace" "None" "Stone" "BrkCmn" NA
sort(table(train_quali$MasVnrType))
BrkCmn Stone BrkFace None
15 128 445 864
# Alterando os valores de NA para o valor da Moda de MasVnrType.
train_quali <- train_quali %>% mutate(MasVnrType=coalesce(MasVnrType,"None"))
##############################################
# Contando os valores NA`s das variáveis com missing values
unique(train_quali$BsmtQual)
[1] "Gd" "TA" "Ex" NA "Fa"
sort(table(train_quali$BsmtQual))
Fa Ex Gd TA
35 121 618 649
# Alterando os valores de NA para o valor da Moda de BsmtQual.
train_quali <- train_quali %>%
mutate(BsmtQual=coalesce(BsmtQual,"TA"))
##############################################
# Contando os valores NA`s das variáveis com missing values
unique(train_quali$BsmtCond)
[1] "TA" "Gd" NA "Fa" "Po"
sort(table(train_quali$BsmtCond))
Po Fa Gd TA
2 45 65 1311
# Alterando os valores de NA para o valor da Moda de BsmtCond
train_quali <- train_quali %>%
mutate(BsmtCond=coalesce(BsmtCond,"TA"))
##############################################
# Contando os valores NA`s das variáveis com missing values
unique(train_quali$BsmtExposure)
[1] "No" "Gd" "Mn" "Av" NA
sort(table(train_quali$BsmtExposure))
Mn Gd Av No
114 134 221 953
# Alterando os valores de NA para o valor da Moda de BsmtExposure
train_quali <- train_quali %>%
mutate(BsmtExposure=coalesce(BsmtExposure,"No"))
##############################################
# Contando os valores NA`s das variáveis com missing values
unique(train_quali$BsmtFinType1)
[1] "GLQ" "ALQ" "Unf" "Rec" "BLQ" NA "LwQ"
sort(table(train_quali$BsmtFinType1))
LwQ Rec BLQ ALQ GLQ Unf
74 133 148 220 418 430
# Alterando os valores de NA para o valor da Moda de BsmtFinType1
train_quali <- train_quali %>%
mutate(BsmtFinType1=coalesce(BsmtFinType1,"Unf"))
##############################################
# Contando os valores NA`s das variáveis com missing values
unique(train_quali$BsmtFinType2)
[1] "Unf" "BLQ" NA "ALQ" "Rec" "LwQ" "GLQ"
sort(table(train_quali$BsmtFinType2))
GLQ ALQ BLQ LwQ Rec Unf
14 19 33 46 54 1256
# Alterando os valores de NA para o valor da Moda de BsmtFinType2
train_quali <- train_quali %>%
mutate(BsmtFinType2=coalesce(BsmtFinType2,"Unf"))
##############################################
# Contando os valores NA`s das variáveis com missing values
unique(train_quali$Electrical)
[1] "SBrkr" "FuseF" "FuseA" "FuseP" "Mix" NA
sort(table(train_quali$Electrical))
Mix FuseP FuseF FuseA SBrkr
1 3 27 94 1334
# Alterando os valores de NA para o valor da Moda de Electrical
train_quali <- train_quali %>%
mutate(Electrical=coalesce(Electrical,"SBrkr"))
##############################################
# Contando os valores NA`s das variáveis com missing values
unique(train_quali$FireplaceQu)
[1] NA "TA" "Gd" "Fa" "Ex" "Po"
sort(table(train_quali$FireplaceQu))
Po Ex Fa TA Gd
20 24 33 313 380
# Alterando os valores de NA para o valor da Moda de FireplaceQu
train_quali <- train_quali %>%
mutate(FireplaceQu=coalesce(FireplaceQu,"Gd"))
##############################################
# Contando os valores NA`s das variáveis com missing values
unique(train_quali$GarageType)
[1] "Attchd" "Detchd" "BuiltIn" "CarPort" NA "Basment"
[7] "2Types"
sort(table(train_quali$GarageType))
2Types CarPort Basment BuiltIn Detchd Attchd
6 9 19 88 387 870
# Alterando os valores de NA para o valor da Moda de GarageType
train_quali <- train_quali %>%
mutate(GarageType=coalesce(GarageType,"Attchd"))
##############################################
# Contando os valores NA`s das variáveis com missing values
unique(train_quali$GarageFinish)
[1] "RFn" "Unf" "Fin" NA
sort(table(train_quali$GarageFinish))
Fin RFn Unf
352 422 605
# Alterando os valores de NA para o valor da Moda de GarageFinish
train_quali <- train_quali %>%
mutate(GarageFinish=coalesce(GarageFinish,"Unf"))
##############################################
# Contando os valores NA`s das variáveis com missing values
unique(train_quali$GarageQual)
[1] "TA" "Fa" "Gd" NA "Ex" "Po"
sort(table(train_quali$GarageQual))
Ex Po Gd Fa TA
3 3 14 48 1311
# Alterando os valores de NA para o valor da Moda de GarageQual
train_quali <- train_quali %>%
mutate(GarageQual=coalesce(GarageQual,"TA"))
##############################################
# Contando os valores NA`s das variáveis com missing values
unique(train_quali$GarageCond)
[1] "TA" "Fa" NA "Gd" "Po" "Ex"
sort(table(train_quali$GarageCond))
Ex Po Gd Fa TA
2 7 9 35 1326
# Alterando os valores de NA para o valor da Moda de GarageCond
train_quali <- train_quali %>%
mutate(GarageCond=coalesce(GarageCond,"TA"))
No entanto, alterar na mão cada variável não é a solução mais inteligente, imagine um caso de um dataset com centenas ou até milhares de variáveis para serem alteradas.
O melhor a ser feito é loop do tipo for, capaz de varrer todas as variáveis, obter o valor de frequencia para cada valor e armazenar este com a moda e em seguida substituir os missing values de cada variável.
Vamos então carregar o banco de dados novamente para anular as transformações realizadas e separar em dados quantitativos e qualitativos e em seguida somar os valores NA`s das variáveis.
# Somando os valores NA`s para cada coluna
colSums(is.na(train_quali))
MSZoning Street LotShape LandContour Utilities
0 0 0 0 0
LotConfig LandSlope Neighborhood Condition1 Condition2
0 0 0 0 0
BldgType HouseStyle RoofStyle RoofMatl Exterior1st
0 0 0 0 0
Exterior2nd MasVnrType ExterQual ExterCond Foundation
0 0 0 0 0
BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinType2
0 0 0 0 0
Heating HeatingQC CentralAir Electrical KitchenQual
0 0 0 0 0
Functional FireplaceQu GarageType GarageFinish GarageQual
0 0 0 0 0
GarageCond PavedDrive SaleType SaleCondition
0 0 0 0
# Montando um loop do tipo for
for(item in colnames(train_quali)){
x = data.frame(sort(table(train_quali[item])))
#print(x)
mode = tail(x, n = 1)
#print(mode)
moda = (mode[1,1])
moda <- (as.character(moda))
#print(moda)
train_quali <- train_quali %>%
mutate(across(item, ~ case_when(is.na(.) ~ moda, TRUE ~ .)))
}
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `across(item, ~case_when(is.na(.) ~ moda, TRUE ~ .))`.
Caused by warning:
! Using an external vector in selections was deprecated in tidyselect
1.1.0.
ℹ Please use `all_of()` or `any_of()` instead.
# Was:
data %>% select(item)
# Now:
data %>% select(all_of(item))
See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
This warning is displayed once every 8 hours.
Call `lifecycle::last_lifecycle_warnings()` to see where this warning
was generated.
# testando o resultado
unique(train_quali$Street)
[1] "Pave" "Grvl"
table(train_quali$Street)
Grvl Pave
6 1454
# Somando os valores NA`s para cada coluna para conferir os resultados
colSums(is.na(train_quali))
MSZoning Street LotShape LandContour Utilities
0 0 0 0 0
LotConfig LandSlope Neighborhood Condition1 Condition2
0 0 0 0 0
BldgType HouseStyle RoofStyle RoofMatl Exterior1st
0 0 0 0 0
Exterior2nd MasVnrType ExterQual ExterCond Foundation
0 0 0 0 0
BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinType2
0 0 0 0 0
Heating HeatingQC CentralAir Electrical KitchenQual
0 0 0 0 0
Functional FireplaceQu GarageType GarageFinish GarageQual
0 0 0 0 0
GarageCond PavedDrive SaleType SaleCondition
0 0 0 0
Da mesma maneira que realizamos a limpeza dos dados faltantes no banco de dados qualitativos iniciamos aqui a limpeza de dados faltantes nos dados quantitativos
# Visualizando os dados
glimpse(train_quanti)
Rows: 1,460
Columns: 38
$ Id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1…
$ MSSubClass <int> 60, 20, 60, 70, 60, 50, 20, 60, 50, 190, 20, 60,…
$ LotFrontage <int> 65, 80, 68, 60, 84, 85, 75, NA, 51, 50, 70, 85, …
$ LotArea <int> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 10…
$ OverallQual <int> 7, 6, 7, 7, 8, 5, 8, 7, 7, 5, 5, 9, 5, 7, 6, 7, …
$ OverallCond <int> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 5, 5, 8, …
$ YearBuilt <int> 2003, 1976, 2001, 1915, 2000, 1993, 2004, 1973, …
$ YearRemodAdd <int> 2003, 1976, 2002, 1970, 2000, 1995, 2005, 1973, …
$ MasVnrArea <int> 196, 0, 162, 0, 350, 0, 186, 240, 0, 0, 0, 286, …
$ BsmtFinSF1 <int> 706, 978, 486, 216, 655, 732, 1369, 859, 0, 851,…
$ BsmtFinSF2 <int> 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0,…
$ BsmtUnfSF <int> 150, 284, 434, 540, 490, 64, 317, 216, 952, 140,…
$ TotalBsmtSF <int> 856, 1262, 920, 756, 1145, 796, 1686, 1107, 952,…
$ X1stFlrSF <int> 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1022…
$ X2ndFlrSF <int> 854, 0, 866, 756, 1053, 566, 0, 983, 752, 0, 0, …
$ LowQualFinSF <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ GrLivArea <int> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 2090, …
$ BsmtFullBath <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, …
$ BsmtHalfBath <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ FullBath <int> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, 1, …
$ HalfBath <int> 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, …
$ BedroomAbvGr <int> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2, 3, 4, 2, 3, 2, 2, …
$ KitchenAbvGr <int> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, …
$ TotRmsAbvGrd <int> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5, 5, 11, 4, 7, 5, 5,…
$ Fireplaces <int> 0, 1, 1, 1, 1, 0, 1, 2, 2, 2, 0, 2, 0, 1, 1, 0, …
$ GarageYrBlt <int> 2003, 1976, 2001, 1998, 2000, 1993, 2004, 1973, …
$ GarageCars <int> 2, 2, 2, 3, 3, 2, 2, 2, 2, 1, 1, 3, 1, 3, 1, 2, …
$ GarageArea <int> 548, 460, 608, 642, 836, 480, 636, 484, 468, 205…
$ WoodDeckSF <int> 0, 298, 0, 0, 192, 40, 255, 235, 90, 0, 0, 147, …
$ OpenPorchSF <int> 61, 0, 42, 35, 84, 30, 57, 204, 0, 4, 0, 21, 0, …
$ EnclosedPorch <int> 0, 0, 0, 272, 0, 0, 0, 228, 205, 0, 0, 0, 0, 0, …
$ X3SsnPorch <int> 0, 0, 0, 0, 0, 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ ScreenPorch <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 176, 0, 0, 0…
$ PoolArea <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ MiscVal <int> 0, 0, 0, 0, 0, 700, 0, 350, 0, 0, 0, 0, 0, 0, 0,…
$ MoSold <int> 2, 5, 9, 2, 12, 10, 8, 11, 4, 1, 2, 7, 9, 8, 5, …
$ YrSold <int> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 2009, …
$ SalePrice <int> 208500, 181500, 223500, 140000, 250000, 143000, …
# Somando os valores faltantes
colSums(is.na(train_quanti))
Id MSSubClass LotFrontage LotArea OverallQual
0 0 259 0 0
OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1
0 0 0 8 0
BsmtFinSF2 BsmtUnfSF TotalBsmtSF X1stFlrSF X2ndFlrSF
0 0 0 0 0
LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
0 0 0 0 0
HalfBath BedroomAbvGr KitchenAbvGr TotRmsAbvGrd Fireplaces
0 0 0 0 0
GarageYrBlt GarageCars GarageArea WoodDeckSF OpenPorchSF
81 0 0 0 0
EnclosedPorch X3SsnPorch ScreenPorch PoolArea MiscVal
0 0 0 0 0
MoSold YrSold SalePrice
0 0 0
Com o resultado do somatório dos valores ausentes por variável do banco de dados quantitativos, observamos que apenas 3 variáveis possuem missing values. Sendo assim a melhor opção é utilizar a função mutate() e substituir os NA`s pela média entre as observações para cada variável.
# LotFrontage
train_quanti <- train_quanti %>%
mutate(LotFrontage=coalesce(LotFrontage,
mean(x =train_quanti$LotFrontage, na.rm = T )))
# MasVnrArea
train_quanti <- train_quanti %>%
mutate(MasVnrArea=coalesce(MasVnrArea,
mean(x =train_quanti$MasVnrArea, na.rm = T )))
# GarageYrBlt
train_quanti <- train_quanti %>%
mutate(GarageYrBlt=coalesce(GarageYrBlt,
mean(x =train_quanti$GarageYrBlt, na.rm = T )))
# Conferindo se ainda existe valores faltantes:
colSums(is.na(train_quanti))
Após a limpeza dos dados de forma adequada de acordo com o tipo de dado, onde para os dados categóricos nós utilizamos o valor da moda para substituir os missing values e o valor da media para substituir os dados ausente nos dados quantitativos, podemos com segurança unir esse bancos de dados em apenas um banco de dado já tratado e pronto para a etapa de análise de dados.
# Unindo os bancos de dados
train.clean <- cbind(train_quali,train_quanti)
names(train.clean)
[1] "MSZoning" "Street" "LotShape" "LandContour"
[5] "Utilities" "LotConfig" "LandSlope" "Neighborhood"
[9] "Condition1" "Condition2" "BldgType" "HouseStyle"
[13] "RoofStyle" "RoofMatl" "Exterior1st" "Exterior2nd"
[17] "MasVnrType" "ExterQual" "ExterCond" "Foundation"
[21] "BsmtQual" "BsmtCond" "BsmtExposure" "BsmtFinType1"
[25] "BsmtFinType2" "Heating" "HeatingQC" "CentralAir"
[29] "Electrical" "KitchenQual" "Functional" "FireplaceQu"
[33] "GarageType" "GarageFinish" "GarageQual" "GarageCond"
[37] "PavedDrive" "SaleType" "SaleCondition" "Id"
[41] "MSSubClass" "LotFrontage" "LotArea" "OverallQual"
[45] "OverallCond" "YearBuilt" "YearRemodAdd" "MasVnrArea"
[49] "BsmtFinSF1" "BsmtFinSF2" "BsmtUnfSF" "TotalBsmtSF"
[53] "X1stFlrSF" "X2ndFlrSF" "LowQualFinSF" "GrLivArea"
[57] "BsmtFullBath" "BsmtHalfBath" "FullBath" "HalfBath"
[61] "BedroomAbvGr" "KitchenAbvGr" "TotRmsAbvGrd" "Fireplaces"
[65] "GarageYrBlt" "GarageCars" "GarageArea" "WoodDeckSF"
[69] "OpenPorchSF" "EnclosedPorch" "X3SsnPorch" "ScreenPorch"
[73] "PoolArea" "MiscVal" "MoSold" "YrSold"
[77] "SalePrice"
Da mesma forma que procedemos com os dados de treino iremos abordar agora os dados de teste e deixar todos os nossos bancos de dados prontos para a etapa de análise exploratória de dados. A primeira etapa será a separação em dados qualitativos e dados quantitativos, seguido da limpeza dos valores ausentes e por fim a união dos bancos quali e quanti de teste em apenas um dataframe tratado.
# Visualizando os dados de teste
glimpse(test)
Rows: 1,459
Columns: 80
$ Id <int> 1461, 1462, 1463, 1464, 1465, 1466, 1467, 1468, …
$ MSSubClass <int> 20, 20, 60, 60, 120, 60, 20, 60, 20, 20, 120, 16…
$ MSZoning <chr> "RH", "RL", "RL", "RL", "RL", "RL", "RL", "RL", …
$ LotFrontage <int> 80, 81, 74, 78, 43, 75, NA, 63, 85, 70, 26, 21, …
$ LotArea <int> 11622, 14267, 13830, 9978, 5005, 10000, 7980, 84…
$ Street <chr> "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", …
$ Alley <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ LotShape <chr> "Reg", "IR1", "IR1", "IR1", "IR1", "IR1", "IR1",…
$ LandContour <chr> "Lvl", "Lvl", "Lvl", "Lvl", "HLS", "Lvl", "Lvl",…
$ Utilities <chr> "AllPub", "AllPub", "AllPub", "AllPub", "AllPub"…
$ LotConfig <chr> "Inside", "Corner", "Inside", "Inside", "Inside"…
$ LandSlope <chr> "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl",…
$ Neighborhood <chr> "NAmes", "NAmes", "Gilbert", "Gilbert", "StoneBr…
$ Condition1 <chr> "Feedr", "Norm", "Norm", "Norm", "Norm", "Norm",…
$ Condition2 <chr> "Norm", "Norm", "Norm", "Norm", "Norm", "Norm", …
$ BldgType <chr> "1Fam", "1Fam", "1Fam", "1Fam", "TwnhsE", "1Fam"…
$ HouseStyle <chr> "1Story", "1Story", "2Story", "2Story", "1Story"…
$ OverallQual <int> 5, 6, 5, 6, 8, 6, 6, 6, 7, 4, 7, 6, 5, 6, 7, 9, …
$ OverallCond <int> 6, 6, 5, 6, 5, 5, 7, 5, 5, 5, 5, 5, 5, 6, 6, 5, …
$ YearBuilt <int> 1961, 1958, 1997, 1998, 1992, 1993, 1992, 1998, …
$ YearRemodAdd <int> 1961, 1958, 1998, 1998, 1992, 1994, 2007, 1998, …
$ RoofStyle <chr> "Gable", "Hip", "Gable", "Gable", "Gable", "Gabl…
$ RoofMatl <chr> "CompShg", "CompShg", "CompShg", "CompShg", "Com…
$ Exterior1st <chr> "VinylSd", "Wd Sdng", "VinylSd", "VinylSd", "HdB…
$ Exterior2nd <chr> "VinylSd", "Wd Sdng", "VinylSd", "VinylSd", "HdB…
$ MasVnrType <chr> "None", "BrkFace", "None", "BrkFace", "None", "N…
$ MasVnrArea <int> 0, 108, 0, 20, 0, 0, 0, 0, 0, 0, 0, 504, 492, 0,…
$ ExterQual <chr> "TA", "TA", "TA", "TA", "Gd", "TA", "TA", "TA", …
$ ExterCond <chr> "TA", "TA", "TA", "TA", "TA", "TA", "Gd", "TA", …
$ Foundation <chr> "CBlock", "CBlock", "PConc", "PConc", "PConc", "…
$ BsmtQual <chr> "TA", "TA", "Gd", "TA", "Gd", "Gd", "Gd", "Gd", …
$ BsmtCond <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", …
$ BsmtExposure <chr> "No", "No", "No", "No", "No", "No", "No", "No", …
$ BsmtFinType1 <chr> "Rec", "ALQ", "GLQ", "GLQ", "ALQ", "Unf", "ALQ",…
$ BsmtFinSF1 <int> 468, 923, 791, 602, 263, 0, 935, 0, 637, 804, 10…
$ BsmtFinType2 <chr> "LwQ", "Unf", "Unf", "Unf", "Unf", "Unf", "Unf",…
$ BsmtFinSF2 <int> 144, 0, 0, 0, 0, 0, 0, 0, 0, 78, 0, 0, 0, 0, 0, …
$ BsmtUnfSF <int> 270, 406, 137, 324, 1017, 763, 233, 789, 663, 0,…
$ TotalBsmtSF <int> 882, 1329, 928, 926, 1280, 763, 1168, 789, 1300,…
$ Heating <chr> "GasA", "GasA", "GasA", "GasA", "GasA", "GasA", …
$ HeatingQC <chr> "TA", "TA", "Gd", "Ex", "Ex", "Gd", "Ex", "Gd", …
$ CentralAir <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y"…
$ Electrical <chr> "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SB…
$ X1stFlrSF <int> 896, 1329, 928, 926, 1280, 763, 1187, 789, 1341,…
$ X2ndFlrSF <int> 0, 0, 701, 678, 0, 892, 0, 676, 0, 0, 0, 504, 56…
$ LowQualFinSF <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ GrLivArea <int> 896, 1329, 1629, 1604, 1280, 1655, 1187, 1465, 1…
$ BsmtFullBath <int> 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, …
$ BsmtHalfBath <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ FullBath <int> 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 1, 1, 2, 1, 2, …
$ HalfBath <int> 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, …
$ BedroomAbvGr <int> 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 3, 3, 2, 3, …
$ KitchenAbvGr <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ KitchenQual <chr> "TA", "Gd", "TA", "Gd", "Gd", "TA", "TA", "TA", …
$ TotRmsAbvGrd <int> 5, 6, 6, 7, 5, 7, 6, 7, 5, 4, 5, 5, 6, 6, 4, 10,…
$ Functional <chr> "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ",…
$ Fireplaces <int> 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, …
$ FireplaceQu <chr> NA, NA, "TA", "Gd", NA, "TA", NA, "Gd", "Po", NA…
$ GarageType <chr> "Attchd", "Attchd", "Attchd", "Attchd", "Attchd"…
$ GarageYrBlt <int> 1961, 1958, 1997, 1998, 1992, 1993, 1992, 1998, …
$ GarageFinish <chr> "Unf", "Unf", "Fin", "Fin", "RFn", "Fin", "Fin",…
$ GarageCars <int> 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 1, 3, …
$ GarageArea <int> 730, 312, 482, 470, 506, 440, 420, 393, 506, 525…
$ GarageQual <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", …
$ GarageCond <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", …
$ PavedDrive <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y"…
$ WoodDeckSF <int> 140, 393, 212, 360, 0, 157, 483, 0, 192, 240, 20…
$ OpenPorchSF <int> 0, 36, 34, 36, 82, 84, 21, 75, 0, 0, 68, 0, 0, 0…
$ EnclosedPorch <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ X3SsnPorch <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ ScreenPorch <int> 120, 0, 0, 0, 144, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ PoolArea <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ PoolQC <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ Fence <chr> "MnPrv", NA, "MnPrv", NA, NA, NA, "GdPrv", NA, N…
$ MiscFeature <chr> NA, "Gar2", NA, NA, NA, NA, "Shed", NA, NA, NA, …
$ MiscVal <int> 0, 12500, 0, 0, 0, 0, 500, 0, 0, 0, 0, 0, 0, 0, …
$ MoSold <int> 6, 6, 3, 6, 1, 4, 3, 5, 2, 4, 6, 2, 3, 6, 6, 1, …
$ YrSold <int> 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, …
$ SaleType <chr> "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", …
$ SaleCondition <chr> "Normal", "Normal", "Normal", "Normal", "Normal"…
# separando os dados de teste em quanti e quali
test_quanti <- select_if(test, is.numeric)
test_quali <- select_if(test, is.character)
# Visualizando o resultado
glimpse(test_quali)
Rows: 1,459
Columns: 43
$ MSZoning <chr> "RH", "RL", "RL", "RL", "RL", "RL", "RL", "RL", …
$ Street <chr> "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", …
$ Alley <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ LotShape <chr> "Reg", "IR1", "IR1", "IR1", "IR1", "IR1", "IR1",…
$ LandContour <chr> "Lvl", "Lvl", "Lvl", "Lvl", "HLS", "Lvl", "Lvl",…
$ Utilities <chr> "AllPub", "AllPub", "AllPub", "AllPub", "AllPub"…
$ LotConfig <chr> "Inside", "Corner", "Inside", "Inside", "Inside"…
$ LandSlope <chr> "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl",…
$ Neighborhood <chr> "NAmes", "NAmes", "Gilbert", "Gilbert", "StoneBr…
$ Condition1 <chr> "Feedr", "Norm", "Norm", "Norm", "Norm", "Norm",…
$ Condition2 <chr> "Norm", "Norm", "Norm", "Norm", "Norm", "Norm", …
$ BldgType <chr> "1Fam", "1Fam", "1Fam", "1Fam", "TwnhsE", "1Fam"…
$ HouseStyle <chr> "1Story", "1Story", "2Story", "2Story", "1Story"…
$ RoofStyle <chr> "Gable", "Hip", "Gable", "Gable", "Gable", "Gabl…
$ RoofMatl <chr> "CompShg", "CompShg", "CompShg", "CompShg", "Com…
$ Exterior1st <chr> "VinylSd", "Wd Sdng", "VinylSd", "VinylSd", "HdB…
$ Exterior2nd <chr> "VinylSd", "Wd Sdng", "VinylSd", "VinylSd", "HdB…
$ MasVnrType <chr> "None", "BrkFace", "None", "BrkFace", "None", "N…
$ ExterQual <chr> "TA", "TA", "TA", "TA", "Gd", "TA", "TA", "TA", …
$ ExterCond <chr> "TA", "TA", "TA", "TA", "TA", "TA", "Gd", "TA", …
$ Foundation <chr> "CBlock", "CBlock", "PConc", "PConc", "PConc", "…
$ BsmtQual <chr> "TA", "TA", "Gd", "TA", "Gd", "Gd", "Gd", "Gd", …
$ BsmtCond <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", …
$ BsmtExposure <chr> "No", "No", "No", "No", "No", "No", "No", "No", …
$ BsmtFinType1 <chr> "Rec", "ALQ", "GLQ", "GLQ", "ALQ", "Unf", "ALQ",…
$ BsmtFinType2 <chr> "LwQ", "Unf", "Unf", "Unf", "Unf", "Unf", "Unf",…
$ Heating <chr> "GasA", "GasA", "GasA", "GasA", "GasA", "GasA", …
$ HeatingQC <chr> "TA", "TA", "Gd", "Ex", "Ex", "Gd", "Ex", "Gd", …
$ CentralAir <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y"…
$ Electrical <chr> "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SB…
$ KitchenQual <chr> "TA", "Gd", "TA", "Gd", "Gd", "TA", "TA", "TA", …
$ Functional <chr> "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ",…
$ FireplaceQu <chr> NA, NA, "TA", "Gd", NA, "TA", NA, "Gd", "Po", NA…
$ GarageType <chr> "Attchd", "Attchd", "Attchd", "Attchd", "Attchd"…
$ GarageFinish <chr> "Unf", "Unf", "Fin", "Fin", "RFn", "Fin", "Fin",…
$ GarageQual <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", …
$ GarageCond <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", …
$ PavedDrive <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y"…
$ PoolQC <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ Fence <chr> "MnPrv", NA, "MnPrv", NA, NA, NA, "GdPrv", NA, N…
$ MiscFeature <chr> NA, "Gar2", NA, NA, NA, NA, "Shed", NA, NA, NA, …
$ SaleType <chr> "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", …
$ SaleCondition <chr> "Normal", "Normal", "Normal", "Normal", "Normal"…
glimpse(test_quanti)
Rows: 1,459
Columns: 37
$ Id <int> 1461, 1462, 1463, 1464, 1465, 1466, 1467, 1468, …
$ MSSubClass <int> 20, 20, 60, 60, 120, 60, 20, 60, 20, 20, 120, 16…
$ LotFrontage <int> 80, 81, 74, 78, 43, 75, NA, 63, 85, 70, 26, 21, …
$ LotArea <int> 11622, 14267, 13830, 9978, 5005, 10000, 7980, 84…
$ OverallQual <int> 5, 6, 5, 6, 8, 6, 6, 6, 7, 4, 7, 6, 5, 6, 7, 9, …
$ OverallCond <int> 6, 6, 5, 6, 5, 5, 7, 5, 5, 5, 5, 5, 5, 6, 6, 5, …
$ YearBuilt <int> 1961, 1958, 1997, 1998, 1992, 1993, 1992, 1998, …
$ YearRemodAdd <int> 1961, 1958, 1998, 1998, 1992, 1994, 2007, 1998, …
$ MasVnrArea <int> 0, 108, 0, 20, 0, 0, 0, 0, 0, 0, 0, 504, 492, 0,…
$ BsmtFinSF1 <int> 468, 923, 791, 602, 263, 0, 935, 0, 637, 804, 10…
$ BsmtFinSF2 <int> 144, 0, 0, 0, 0, 0, 0, 0, 0, 78, 0, 0, 0, 0, 0, …
$ BsmtUnfSF <int> 270, 406, 137, 324, 1017, 763, 233, 789, 663, 0,…
$ TotalBsmtSF <int> 882, 1329, 928, 926, 1280, 763, 1168, 789, 1300,…
$ X1stFlrSF <int> 896, 1329, 928, 926, 1280, 763, 1187, 789, 1341,…
$ X2ndFlrSF <int> 0, 0, 701, 678, 0, 892, 0, 676, 0, 0, 0, 504, 56…
$ LowQualFinSF <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ GrLivArea <int> 896, 1329, 1629, 1604, 1280, 1655, 1187, 1465, 1…
$ BsmtFullBath <int> 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, …
$ BsmtHalfBath <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ FullBath <int> 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 1, 1, 2, 1, 2, …
$ HalfBath <int> 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, …
$ BedroomAbvGr <int> 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 3, 3, 2, 3, …
$ KitchenAbvGr <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ TotRmsAbvGrd <int> 5, 6, 6, 7, 5, 7, 6, 7, 5, 4, 5, 5, 6, 6, 4, 10,…
$ Fireplaces <int> 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, …
$ GarageYrBlt <int> 1961, 1958, 1997, 1998, 1992, 1993, 1992, 1998, …
$ GarageCars <int> 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 1, 3, …
$ GarageArea <int> 730, 312, 482, 470, 506, 440, 420, 393, 506, 525…
$ WoodDeckSF <int> 140, 393, 212, 360, 0, 157, 483, 0, 192, 240, 20…
$ OpenPorchSF <int> 0, 36, 34, 36, 82, 84, 21, 75, 0, 0, 68, 0, 0, 0…
$ EnclosedPorch <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ X3SsnPorch <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ ScreenPorch <int> 120, 0, 0, 0, 144, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ PoolArea <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ MiscVal <int> 0, 12500, 0, 0, 0, 0, 500, 0, 0, 0, 0, 0, 0, 0, …
$ MoSold <int> 6, 6, 3, 6, 1, 4, 3, 5, 2, 4, 6, 2, 3, 6, 6, 1, …
$ YrSold <int> 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, …
# Tratando os dados qualitativos
colSums(is.na(test_quali))
MSZoning Street Alley LotShape LandContour
4 0 1352 0 0
Utilities LotConfig LandSlope Neighborhood Condition1
2 0 0 0 0
Condition2 BldgType HouseStyle RoofStyle RoofMatl
0 0 0 0 0
Exterior1st Exterior2nd MasVnrType ExterQual ExterCond
1 1 16 0 0
Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1
0 44 45 44 42
BsmtFinType2 Heating HeatingQC CentralAir Electrical
42 0 0 0 0
KitchenQual Functional FireplaceQu GarageType GarageFinish
1 2 730 76 78
GarageQual GarageCond PavedDrive PoolQC Fence
78 78 0 1456 1169
MiscFeature SaleType SaleCondition
1408 1 0
# Observe que temos os mesmos problemas, existem variável que são praticamente nulas, sendo
# a melhor escolha deletar elas do nosso banco de dados.
test_quali <- test_quali %>% select(-c(Alley, PoolQC, Fence, MiscFeature))
glimpse(test_quali)
Rows: 1,459
Columns: 39
$ MSZoning <chr> "RH", "RL", "RL", "RL", "RL", "RL", "RL", "RL", …
$ Street <chr> "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", …
$ LotShape <chr> "Reg", "IR1", "IR1", "IR1", "IR1", "IR1", "IR1",…
$ LandContour <chr> "Lvl", "Lvl", "Lvl", "Lvl", "HLS", "Lvl", "Lvl",…
$ Utilities <chr> "AllPub", "AllPub", "AllPub", "AllPub", "AllPub"…
$ LotConfig <chr> "Inside", "Corner", "Inside", "Inside", "Inside"…
$ LandSlope <chr> "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl",…
$ Neighborhood <chr> "NAmes", "NAmes", "Gilbert", "Gilbert", "StoneBr…
$ Condition1 <chr> "Feedr", "Norm", "Norm", "Norm", "Norm", "Norm",…
$ Condition2 <chr> "Norm", "Norm", "Norm", "Norm", "Norm", "Norm", …
$ BldgType <chr> "1Fam", "1Fam", "1Fam", "1Fam", "TwnhsE", "1Fam"…
$ HouseStyle <chr> "1Story", "1Story", "2Story", "2Story", "1Story"…
$ RoofStyle <chr> "Gable", "Hip", "Gable", "Gable", "Gable", "Gabl…
$ RoofMatl <chr> "CompShg", "CompShg", "CompShg", "CompShg", "Com…
$ Exterior1st <chr> "VinylSd", "Wd Sdng", "VinylSd", "VinylSd", "HdB…
$ Exterior2nd <chr> "VinylSd", "Wd Sdng", "VinylSd", "VinylSd", "HdB…
$ MasVnrType <chr> "None", "BrkFace", "None", "BrkFace", "None", "N…
$ ExterQual <chr> "TA", "TA", "TA", "TA", "Gd", "TA", "TA", "TA", …
$ ExterCond <chr> "TA", "TA", "TA", "TA", "TA", "TA", "Gd", "TA", …
$ Foundation <chr> "CBlock", "CBlock", "PConc", "PConc", "PConc", "…
$ BsmtQual <chr> "TA", "TA", "Gd", "TA", "Gd", "Gd", "Gd", "Gd", …
$ BsmtCond <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", …
$ BsmtExposure <chr> "No", "No", "No", "No", "No", "No", "No", "No", …
$ BsmtFinType1 <chr> "Rec", "ALQ", "GLQ", "GLQ", "ALQ", "Unf", "ALQ",…
$ BsmtFinType2 <chr> "LwQ", "Unf", "Unf", "Unf", "Unf", "Unf", "Unf",…
$ Heating <chr> "GasA", "GasA", "GasA", "GasA", "GasA", "GasA", …
$ HeatingQC <chr> "TA", "TA", "Gd", "Ex", "Ex", "Gd", "Ex", "Gd", …
$ CentralAir <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y"…
$ Electrical <chr> "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SB…
$ KitchenQual <chr> "TA", "Gd", "TA", "Gd", "Gd", "TA", "TA", "TA", …
$ Functional <chr> "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ",…
$ FireplaceQu <chr> NA, NA, "TA", "Gd", NA, "TA", NA, "Gd", "Po", NA…
$ GarageType <chr> "Attchd", "Attchd", "Attchd", "Attchd", "Attchd"…
$ GarageFinish <chr> "Unf", "Unf", "Fin", "Fin", "RFn", "Fin", "Fin",…
$ GarageQual <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", …
$ GarageCond <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", …
$ PavedDrive <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y"…
$ SaleType <chr> "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", …
$ SaleCondition <chr> "Normal", "Normal", "Normal", "Normal", "Normal"…
# Aplicando o mesmo loop for para tratar os dado de teste qualitativos.
for(item in colnames(test_quali)){
#print(item)
x = data.frame(sort(table(test_quali[item])))
#print(x)
mode = tail(x, n = 1)
#print(mode)
moda = (mode[1,1])
moda <- as.character(moda)
#print(train_quali[item])
test_quali <- test_quali %>%
mutate(across(item, ~ case_when(is.na(.) ~ moda, TRUE ~ .)))
}
# Conferindo a soma dos valores nulos nas colunas após a transformção
unique(test_quali$MasVnrType)
[1] "None" "BrkFace" "Stone" "BrkCmn"
table(test_quali$MasVnrType)
BrkCmn BrkFace None Stone
10 434 894 121
colSums(is.na(test_quali))
MSZoning Street LotShape LandContour Utilities
0 0 0 0 0
LotConfig LandSlope Neighborhood Condition1 Condition2
0 0 0 0 0
BldgType HouseStyle RoofStyle RoofMatl Exterior1st
0 0 0 0 0
Exterior2nd MasVnrType ExterQual ExterCond Foundation
0 0 0 0 0
BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinType2
0 0 0 0 0
Heating HeatingQC CentralAir Electrical KitchenQual
0 0 0 0 0
Functional FireplaceQu GarageType GarageFinish GarageQual
0 0 0 0 0
GarageCond PavedDrive SaleType SaleCondition
0 0 0 0
#############################
# Tratando os dados quantitativos
# visuaizando os dados
glimpse(test_quanti)
Rows: 1,459
Columns: 37
$ Id <int> 1461, 1462, 1463, 1464, 1465, 1466, 1467, 1468, …
$ MSSubClass <int> 20, 20, 60, 60, 120, 60, 20, 60, 20, 20, 120, 16…
$ LotFrontage <int> 80, 81, 74, 78, 43, 75, NA, 63, 85, 70, 26, 21, …
$ LotArea <int> 11622, 14267, 13830, 9978, 5005, 10000, 7980, 84…
$ OverallQual <int> 5, 6, 5, 6, 8, 6, 6, 6, 7, 4, 7, 6, 5, 6, 7, 9, …
$ OverallCond <int> 6, 6, 5, 6, 5, 5, 7, 5, 5, 5, 5, 5, 5, 6, 6, 5, …
$ YearBuilt <int> 1961, 1958, 1997, 1998, 1992, 1993, 1992, 1998, …
$ YearRemodAdd <int> 1961, 1958, 1998, 1998, 1992, 1994, 2007, 1998, …
$ MasVnrArea <int> 0, 108, 0, 20, 0, 0, 0, 0, 0, 0, 0, 504, 492, 0,…
$ BsmtFinSF1 <int> 468, 923, 791, 602, 263, 0, 935, 0, 637, 804, 10…
$ BsmtFinSF2 <int> 144, 0, 0, 0, 0, 0, 0, 0, 0, 78, 0, 0, 0, 0, 0, …
$ BsmtUnfSF <int> 270, 406, 137, 324, 1017, 763, 233, 789, 663, 0,…
$ TotalBsmtSF <int> 882, 1329, 928, 926, 1280, 763, 1168, 789, 1300,…
$ X1stFlrSF <int> 896, 1329, 928, 926, 1280, 763, 1187, 789, 1341,…
$ X2ndFlrSF <int> 0, 0, 701, 678, 0, 892, 0, 676, 0, 0, 0, 504, 56…
$ LowQualFinSF <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ GrLivArea <int> 896, 1329, 1629, 1604, 1280, 1655, 1187, 1465, 1…
$ BsmtFullBath <int> 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, …
$ BsmtHalfBath <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ FullBath <int> 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 1, 1, 2, 1, 2, …
$ HalfBath <int> 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, …
$ BedroomAbvGr <int> 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 3, 3, 2, 3, …
$ KitchenAbvGr <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ TotRmsAbvGrd <int> 5, 6, 6, 7, 5, 7, 6, 7, 5, 4, 5, 5, 6, 6, 4, 10,…
$ Fireplaces <int> 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, …
$ GarageYrBlt <int> 1961, 1958, 1997, 1998, 1992, 1993, 1992, 1998, …
$ GarageCars <int> 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 1, 3, …
$ GarageArea <int> 730, 312, 482, 470, 506, 440, 420, 393, 506, 525…
$ WoodDeckSF <int> 140, 393, 212, 360, 0, 157, 483, 0, 192, 240, 20…
$ OpenPorchSF <int> 0, 36, 34, 36, 82, 84, 21, 75, 0, 0, 68, 0, 0, 0…
$ EnclosedPorch <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ X3SsnPorch <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ ScreenPorch <int> 120, 0, 0, 0, 144, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ PoolArea <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ MiscVal <int> 0, 12500, 0, 0, 0, 0, 500, 0, 0, 0, 0, 0, 0, 0, …
$ MoSold <int> 6, 6, 3, 6, 1, 4, 3, 5, 2, 4, 6, 2, 3, 6, 6, 1, …
$ YrSold <int> 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, …
# Conferindo os valores ausentes
colSums(is.na(test_quanti))
Id MSSubClass LotFrontage LotArea OverallQual
0 0 227 0 0
OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1
0 0 0 15 1
BsmtFinSF2 BsmtUnfSF TotalBsmtSF X1stFlrSF X2ndFlrSF
1 1 1 0 0
LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
0 0 2 2 0
HalfBath BedroomAbvGr KitchenAbvGr TotRmsAbvGrd Fireplaces
0 0 0 0 0
GarageYrBlt GarageCars GarageArea WoodDeckSF OpenPorchSF
78 1 1 0 0
EnclosedPorch X3SsnPorch ScreenPorch PoolArea MiscVal
0 0 0 0 0
MoSold YrSold
0 0
# Ao todo temos 8 variaveis que apresentam valores ausentes.
test_quanti <- test_quanti %>%
mutate(LotFrontage=coalesce(LotFrontage,
mean(x =test_quanti$LotFrontage, na.rm = T )))
test_quanti <- test_quanti %>%
mutate(MasVnrArea=coalesce(MasVnrArea,
mean(x =test_quanti$MasVnrArea, na.rm = T )))
test_quanti <- test_quanti %>%
mutate(BsmtFinSF1 =coalesce(BsmtFinSF1 ,
mean(x =test_quanti$BsmtFinSF1 , na.rm = T )))
test_quanti <- test_quanti %>%
mutate(BsmtFinSF2 =coalesce(BsmtFinSF2 ,
mean(x =test_quanti$BsmtFinSF2 , na.rm = T )))
test_quanti <- test_quanti %>%
mutate(BsmtUnfSF =coalesce(BsmtUnfSF ,
mean(x =test_quanti$BsmtUnfSF , na.rm = T )))
test_quanti <- test_quanti %>%
mutate(TotalBsmtSF =coalesce(TotalBsmtSF ,
mean(x =test_quanti$TotalBsmtSF , na.rm = T )))
test_quanti <- test_quanti %>%
mutate(BsmtFullBath =coalesce(BsmtFullBath ,
mean(x =test_quanti$BsmtFullBath , na.rm = T )))
test_quanti <- test_quanti %>%
mutate(BsmtHalfBath =coalesce(BsmtHalfBath ,
mean(x =test_quanti$BsmtHalfBath , na.rm = T )))
test_quanti <- test_quanti %>%
mutate(GarageYrBlt =coalesce(GarageYrBlt ,
mean(x =test_quanti$GarageYrBlt , na.rm = T )))
test_quanti <- test_quanti %>%
mutate(GarageCars =coalesce(GarageCars ,
mean(x =test_quanti$GarageCars , na.rm = T )))
test_quanti <- test_quanti %>%
mutate(GarageArea =coalesce(GarageArea ,
mean(x =test_quanti$GarageArea , na.rm = T )))
colSums(is.na(test_quanti))
Id MSSubClass LotFrontage LotArea OverallQual
0 0 0 0 0
OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1
0 0 0 0 0
BsmtFinSF2 BsmtUnfSF TotalBsmtSF X1stFlrSF X2ndFlrSF
0 0 0 0 0
LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
0 0 0 0 0
HalfBath BedroomAbvGr KitchenAbvGr TotRmsAbvGrd Fireplaces
0 0 0 0 0
GarageYrBlt GarageCars GarageArea WoodDeckSF OpenPorchSF
0 0 0 0 0
EnclosedPorch X3SsnPorch ScreenPorch PoolArea MiscVal
0 0 0 0 0
MoSold YrSold
0 0
# Juntando os dataframes
test.clean <- cbind(test_quali,test_quanti)
Por fim, cabe agora conferir os dados de treino e teste tratados
colSums(is.na(train.clean))
colSums(is.na(test.clean))
Nesta etapa iniciaremos uma analise descritiva tanto das variáveis qualitativas como quantitativas do banco de dados de treino. Para os dados quantitativos abordaremos em primeira instancia uma investigação por estatistica decritiva univariada e depois bivariada para entender a sua relação com a variavel Y do problema que é a SalesPrice.
A estatistica univariada para contempla tabelas de ferequencia de ocorrencia o que já vimos anteriormente ao obtermos a moda, representação gráfica da distribuição e medidas de localização, dispersão ou variabilidade e medidas de forma(assimetria e curtosis).
A tabela de distribuição de frequências é calculada para cada valor discreto da variável. A frequência pode ser absoluta que informa o número de ocorrências de cada elemento i na amostra, pode ser uma frequência relativa que fornece a porcentagem % relativa à frequência absoluta, pode ser uma frequência acumulada que representa a soma de todos os elementos e por fim pode ser uma frequência relativa acumulada que é a frequencia relativa à acumulada.
library(plotly)
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
Attaching package: ‘plotly’
The following object is masked from ‘package:ggplot2’:
last_plot
The following object is masked from ‘package:stats’:
filter
The following object is masked from ‘package:graphics’:
layout
Desta maneira criamos uma forma eficiente de exploramos o banco de dados para as variáveis categóricas. Como temos 39 variáveis exploratórias o melhor a fazer é criar um função que nos retorne essas três informações, a tabela de frequência, o grafico de barras verticais e grafico de barras empilhadas.
Então vamos colocar a mão na massa!!
resultado_HouseStyle
[[1]]
[[2]]
[[3]]
NA
A análise exploratória de dados quantitativos os principais gráficos que irão trazer informações importantes são de linha, dispersão, histograma e box plot.
Vamos então analisar o nosso banco de dados de treino que contem as variáveis quantitativas, e além de fazer essas visualizações também iremos inserir os gráficos de correlação e heatmap.
A nossa variável Y SalePrice, ou seja a que queremos prever, é qualitativa então vamos começar a análise por ela e sugerir posteriores correlações com as outras variáveis.
Usaremos o pacote e1071 para obtermos os gráficos de densidade e análisarmos as assimetrias da distribuição de algumas Variáveis.
# Olhando os nomes das variáveis esse bancod de dados quantitativos.
names(train_quanti)
[1] "Id" "MSSubClass" "LotFrontage" "LotArea" "OverallQual"
[6] "OverallCond" "YearBuilt" "YearRemodAdd" "MasVnrArea" "BsmtFinSF1"
[11] "BsmtFinSF2" "BsmtUnfSF" "TotalBsmtSF" "X1stFlrSF" "X2ndFlrSF"
[16] "LowQualFinSF" "GrLivArea" "BsmtFullBath" "BsmtHalfBath" "FullBath"
[21] "HalfBath" "BedroomAbvGr" "KitchenAbvGr" "TotRmsAbvGrd" "Fireplaces"
[26] "GarageYrBlt" "GarageCars" "GarageArea" "WoodDeckSF" "OpenPorchSF"
[31] "EnclosedPorch" "X3SsnPorch" "ScreenPorch" "PoolArea" "MiscVal"
[36] "MoSold" "YrSold" "SalePrice"
# Extraindo o sumário estatistico
summary(train_quanti)
Id MSSubClass LotFrontage LotArea OverallQual
Min. : 1.0 Min. : 20.0 Min. : 21.00 Min. : 1300 Min. : 1.000
1st Qu.: 365.8 1st Qu.: 20.0 1st Qu.: 60.00 1st Qu.: 7554 1st Qu.: 5.000
Median : 730.5 Median : 50.0 Median : 70.05 Median : 9478 Median : 6.000
Mean : 730.5 Mean : 56.9 Mean : 70.05 Mean : 10517 Mean : 6.099
3rd Qu.:1095.2 3rd Qu.: 70.0 3rd Qu.: 79.00 3rd Qu.: 11602 3rd Qu.: 7.000
Max. :1460.0 Max. :190.0 Max. :313.00 Max. :215245 Max. :10.000
OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1
Min. :1.000 Min. :1872 Min. :1950 Min. : 0.0 Min. : 0.0
1st Qu.:5.000 1st Qu.:1954 1st Qu.:1967 1st Qu.: 0.0 1st Qu.: 0.0
Median :5.000 Median :1973 Median :1994 Median : 0.0 Median : 383.5
Mean :5.575 Mean :1971 Mean :1985 Mean : 103.7 Mean : 443.6
3rd Qu.:6.000 3rd Qu.:2000 3rd Qu.:2004 3rd Qu.: 164.2 3rd Qu.: 712.2
Max. :9.000 Max. :2010 Max. :2010 Max. :1600.0 Max. :5644.0
BsmtFinSF2 BsmtUnfSF TotalBsmtSF X1stFlrSF X2ndFlrSF
Min. : 0.00 Min. : 0.0 Min. : 0.0 Min. : 334 Min. : 0
1st Qu.: 0.00 1st Qu.: 223.0 1st Qu.: 795.8 1st Qu.: 882 1st Qu.: 0
Median : 0.00 Median : 477.5 Median : 991.5 Median :1087 Median : 0
Mean : 46.55 Mean : 567.2 Mean :1057.4 Mean :1163 Mean : 347
3rd Qu.: 0.00 3rd Qu.: 808.0 3rd Qu.:1298.2 3rd Qu.:1391 3rd Qu.: 728
Max. :1474.00 Max. :2336.0 Max. :6110.0 Max. :4692 Max. :2065
LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
Min. : 0.000 Min. : 334 Min. :0.0000 Min. :0.00000 Min. :0.000
1st Qu.: 0.000 1st Qu.:1130 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:1.000
Median : 0.000 Median :1464 Median :0.0000 Median :0.00000 Median :2.000
Mean : 5.845 Mean :1515 Mean :0.4253 Mean :0.05753 Mean :1.565
3rd Qu.: 0.000 3rd Qu.:1777 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:2.000
Max. :572.000 Max. :5642 Max. :3.0000 Max. :2.00000 Max. :3.000
HalfBath BedroomAbvGr KitchenAbvGr TotRmsAbvGrd Fireplaces
Min. :0.0000 Min. :0.000 Min. :0.000 Min. : 2.000 Min. :0.000
1st Qu.:0.0000 1st Qu.:2.000 1st Qu.:1.000 1st Qu.: 5.000 1st Qu.:0.000
Median :0.0000 Median :3.000 Median :1.000 Median : 6.000 Median :1.000
Mean :0.3829 Mean :2.866 Mean :1.047 Mean : 6.518 Mean :0.613
3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:1.000 3rd Qu.: 7.000 3rd Qu.:1.000
Max. :2.0000 Max. :8.000 Max. :3.000 Max. :14.000 Max. :3.000
GarageYrBlt GarageCars GarageArea WoodDeckSF OpenPorchSF
Min. :1900 Min. :0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
1st Qu.:1962 1st Qu.:1.000 1st Qu.: 334.5 1st Qu.: 0.00 1st Qu.: 0.00
Median :1979 Median :2.000 Median : 480.0 Median : 0.00 Median : 25.00
Mean :1979 Mean :1.767 Mean : 473.0 Mean : 94.24 Mean : 46.66
3rd Qu.:2001 3rd Qu.:2.000 3rd Qu.: 576.0 3rd Qu.:168.00 3rd Qu.: 68.00
Max. :2010 Max. :4.000 Max. :1418.0 Max. :857.00 Max. :547.00
EnclosedPorch X3SsnPorch ScreenPorch PoolArea MiscVal
Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.000 Min. : 0.00
1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.00
Median : 0.00 Median : 0.00 Median : 0.00 Median : 0.000 Median : 0.00
Mean : 21.95 Mean : 3.41 Mean : 15.06 Mean : 2.759 Mean : 43.49
3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.000 3rd Qu.: 0.00
Max. :552.00 Max. :508.00 Max. :480.00 Max. :738.000 Max. :15500.00
MoSold YrSold SalePrice
Min. : 1.000 Min. :2006 Min. : 34900
1st Qu.: 5.000 1st Qu.:2007 1st Qu.:129975
Median : 6.000 Median :2008 Median :163000
Mean : 6.322 Mean :2008 Mean :180921
3rd Qu.: 8.000 3rd Qu.:2009 3rd Qu.:214000
Max. :12.000 Max. :2010 Max. :755000
library(e1071)
par(mfrow=c(3,2))
# Preço do Imóvel
plot(density(train_quanti$SalePrice), main="Density Plot: Sale Price", ylab="Frequency",
sub=paste("Assimetria:", round(e1071::skewness(train_quanti$SalePrice), 2)))
polygon(density(train_quanti$SalePrice), col="green")
# Número de quartos
plot(density(train_quanti$BedroomAbvGr), main="Density Plot: Bedroom", ylab="Frequency",
sub=paste("Assimetria:", round(e1071::skewness(train_quanti$BedroomAbvGr), 2)))
polygon(density(train_quanti$BedroomAbvGr), col="orange")
# Números de Banheiros
plot(density(train_quanti$FullBath), main="Density Plot: Bathroom", ylab="Frequency",
sub=paste("Assimetria:", round(e1071::skewness(train_quanti$FullBath), 2)))
polygon(density(train_quanti$FullBath), col="green")
# Área do terreno
plot(density(train_quanti$LotArea), main="Density Plot: Lot Area in feet", ylab="Frequency",
sub=paste("Assimetria:", round(e1071::skewness(train_quanti$LotArea), 2)))
polygon(density(train_quanti$LotArea), col="orange")
# Ano de Construção
plot(density(train_quanti$YearBuilt), main="Density Plot: Year Built", ylab="Frequency",
sub=paste("Assimetria:", round(e1071::skewness(train_quanti$YearBuilt), 2)))
polygon(density(train_quanti$YearBuilt), col="green")
# Área da garagem
plot(density(train_quanti$GarageArea), main="Density Plot: Garage Area", ylab="Frequency",
sub=paste("Assimetria:", round(e1071::skewness(train_quanti$GarageArea), 2)))
polygon(density(train_quanti$GarageArea), col="orange")
NA
NA
Vamos analisar os preços dos imóveis de acordo com o número de quartos, números de banheiros e vagas de garagem. Para isso vamos utilizar o boxplot e ver a variação dos valores e se ocorrem outliers para esses pares, a biblioteca ggplot auxiliará nessa etapa.
library(plotly)
lista=c("FullBath","BedroomAbvGr","GarageCars")
legenda=c("Valor do imovel pela numero de banheiros",
"Valor do imovel pelo numero de quartos",
"Valor do imovel pelo numero de vagas")
plot.boxes<-function(X,legenda){
ggplotly(
ggplot(train_quanti,aes_string(x=X, y="SalePrice",group=X, fill = X))+
geom_boxplot() + labs(title = legenda)+
theme_classic()
)
}
Map(plot.boxes,lista,legenda)
$FullBath
$BedroomAbvGr
$GarageCars
NA
De fato essas variáveis são bem interessante para analisarmos como elas se comportam com a variavel Y SalePrice. No entanto o mais importante agora é investigarmos as correlações entre as variáveis. Para isso vamos utilizar a correlação de Pearson que encontramos no pacote ggcorplot.
library(ggcorrplot)
corr <- round(cor(train_quanti), 1)
# Plot
ggplotly(
ggcorrplot(corr,
type = "lower",
lab = F,
lab_size = 5,
colors = c("tomato2", "white", "springgreen3"),
title="Correlogram of House Prices Kaggle Dataset",
ggtheme=theme_bw)
)
NA
Assim foi possivel observar quem possui maior correlação positiva ou negativa com a variável Sale Price. Vamos agora tentar roda uma regressão linear que possa se capaz de explicar o comportamento do preço do imóvel em relação as outras variáveis. Para isso vamos criar uma função